library(tidyverse)Load and join data
Load libraries
Load data
Loading raw data
## function_file:
curl::curl_download(url = "https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Creviewed%2Cid%2Cgene_names%2Corganism_name%2Cprotein_name%2Ccc_cofactor%2Ccc_catalytic_activity%2Cph_dependence%2Ccc_pathway%2Ccc_function%2Cec%2Ctemp_dependence%2Cgo_f%2Ccc_subcellular_location%2Cft_transmem%2Cft_intramem%2Cprotein_families&format=tsv&query=%28%28taxonomy_id%3A9606%29+OR+%28taxonomy_id%3A10090%29%29+AND+%28reviewed%3Atrue%29",
destfile = "../data/_raw/function_file.tsv.gz")
## sequence_file:
curl::curl_download(url = "https://rest.uniprot.org/uniprotkb/stream?compressed=true&fields=accession%2Creviewed%2Cid%2Cgene_names%2Corganism_name%2Cprotein_name%2Clength%2Cmass%2Csequence%2Cft_act_site%2Cft_binding&format=tsv&query=%28%28taxonomy_id%3A9606%29+OR+%28taxonomy_id%3A10090%29%29+AND+%28reviewed%3Atrue%29",
destfile = "../data/_raw/sequence_file.tsv.gz")Reading raw data
data_function <- read_delim(file = "../data/_raw/function_file.tsv.gz", delim = "\t")
data_sequence <- read_delim(file = "../data/_raw/sequence_file.tsv.gz", delim = "\t")Join data
# joining data
joined_data <- data_function |>
select(Entry, setdiff(colnames(data_function), colnames(data_sequence))) |>
full_join(data_sequence,
by = "Entry")Save data
write_tsv(joined_data, file = "../data/01_dat_load.tsv")Clean and join data
Load libraries
library(tidyverse)Read data
# Reading data
joined_data <- read_delim(file = "../data/01_dat_load.tsv",
delim = "\t")Cleaning part 1: Active site and binding site
# Changing space in column names to "_" and lower case letters
colnames(joined_data) <- str_replace_all(colnames(joined_data),
" ",
"_")
colnames(joined_data) <- str_to_lower(colnames(joined_data))
#####
# Drop all unwanted cols
#####
joined_data <- joined_data |>
select(-reviewed,-`function_[cc]`,
-temperature_dependence,
-protein_names,
-intramembrane,
-catalytic_activity,
-entry_name,
-transmembrane)
####
# Tidying "active site" column
####
joined_data <- joined_data |>
mutate(active_site = str_extract_all(active_site,
"(?<=ACT_SITE )\\d+")) |>
mutate(active_site = map_chr(active_site, ~paste(.x,
collapse = ","))) |>
separate_longer_delim(col = active_site, ",")
####
# Tidying "binding site" column
####
joined_data <- joined_data |>
mutate(binding_site_pos = str_extract_all(binding_site,
"(?<=BINDING\\s)\\d+(\\.\\.\\d+)?"),
binding_site_ligand = str_extract_all(binding_site,
"(?<=/ligand=\"\")[^\"]+(?=\")")) |>
mutate(binding_site_pos = map_chr(binding_site_pos,
~paste(.x, collapse = ";")),
binding_site_ligand = map_chr(binding_site_ligand,
~paste(.x, collapse = ";"))) |>
filter(str_count(binding_site_pos, ";") == str_count(binding_site_ligand, ";")) |>
separate_longer_delim(col = c(binding_site_pos,
binding_site_ligand),
";") |>
select(-binding_site)
# Converting to proper NA values
joined_data <- joined_data |>
mutate(binding_site_pos = na_if(binding_site_pos, "NA")) |>
mutate(binding_site_ligand = na_if(binding_site_ligand, "NA")) |>
mutate(active_site = na_if(active_site, "NA"))Cleaning part 2: Cleaning the remaining columns
###
# Tidy "pathway" column
###
joined_data <- joined_data |>
# Remove unwanted patterns (step is removed since it is a small minority that has this noted)
mutate(pathway = str_remove_all(pathway, "^PATHWAY: "),
pathway = str_remove_all(pathway, "pathway: "),
pathway = str_remove_all(pathway, "\\{.*?\\}"),
pathway = str_remove_all(pathway, ": step \\d+/\\d+"),
pathway = str_remove_all(pathway, "\\."),
pathway = str_remove(pathway, "\\s$")) |>
# Split into longer form
separate_longer_delim(col = pathway, delim = "; ") |>
mutate(pathway = tolower(pathway))
###
# Tidy "subcellular_location_[cc]" column
###
joined_data <- joined_data |>
# Remove header and Notes
mutate(`subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "Note=.*"),
`subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "SUBCELLULAR LOCATION: "),
`subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "\\{.*?\\}"),
`subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "\\[.*?\\]"),
`subcellular_location_[cc]` = str_remove_all(`subcellular_location_[cc]`, "\\.\\s*$")) |>
# Use delimiter to make data longer (there are several for cofactors)
separate_longer_delim(col = `subcellular_location_[cc]`, delim = "; ") |>
separate_longer_delim(col = `subcellular_location_[cc]`, delim = ". ") |>
separate_longer_delim(col = `subcellular_location_[cc]`, delim = ", ") |>
# Remove '.' ':' and whitespace in beginning and end of strings
mutate(`subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "^[. ]+|[. ]+$"),
`subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "\\: ")) |>
# Convert to lower case
mutate(`subcellular_location_[cc]` = tolower(`subcellular_location_[cc]`)) |>
# Removes space at the end
mutate(`subcellular_location_[cc]` = str_remove(`subcellular_location_[cc]`, "\\s$"))
####
# Tidy "cofactor" column
####
joined_data <- joined_data |>
# keep only cofactor names in column
mutate(cofactor = str_extract_all(cofactor, "(?<=Name=)[^;]+")) |>
mutate(cofactor = map_chr(cofactor, ~paste(.x, collapse = "_"))) |>
separate_longer_delim(col = cofactor, "_") |>
mutate(cofactor = na_if(cofactor, "NA")) |>
mutate(cofactor = tolower(cofactor),
cofactor = str_remove(cofactor, "\\s$"))
####
# Tidy "protein_families" column
####
joined_data <- joined_data |>
# Use delimiter to make data longer
separate_longer_delim(protein_families, delim = ", ")|>
# Extract family type and make new column to put data in + remove from colum
mutate(family_type = str_extract(protein_families,
pattern = "\\S*family\\b", group = NULL),
protein_families = str_remove(protein_families,
pattern = "\\S*family\\b")) |>
separate_longer_delim(protein_families, delim = "; ") |>
# Rename column as there is only one protein family in each row now
rename(protein_family = protein_families) |>
relocate(protein_family, .before = family_type)
####
#Tidy "gene_ontology_(molecular_function)" column
####
joined_data <- joined_data |>
# Give column a better name
rename(GO_molecular_function = `gene_ontology_(molecular_function)`) |>
# Remove GO numbers
mutate(GO_molecular_function = str_remove_all(GO_molecular_function, "\\[.*?\\]")) |>
# Use delimiter to make data longer
separate_longer_delim(GO_molecular_function, delim = " ; ") |>
# Removes space at the end
mutate(GO_molecular_function = str_remove(GO_molecular_function, "\\s$"))
####
# Tidy "pH dependence" column
####
# Extracting the optimum pH
pH_str_search <- "Optimum pH is (\\d+\\.\\d*)-(\\d+\\.\\d*)|Optimum pH is (\\d+\\.\\d*)|Optimum pH is around (\\d+\\.\\d*)|Optimum pH is about (\\d+\\.\\d*)|Optimum pH is (\\d+\\.\\d*) to (\\d+\\.\\d*)|Optimum pH is between (\\d+\\.\\d*) and (\\d+\\.\\d*)"
#Function for creating two new columns for pH interval values
pH_columns <- str_c("pH_", seq(from = 1, to = 2))
# Starting to clean ph_dependence
joined_data <- joined_data |>
#Removes text and keeps optimum pH values
mutate(ph_dependence = str_extract(ph_dependence, pH_str_search),
ph_dependence = str_remove(ph_dependence, "Optimum pH is"),
ph_dependence = str_remove(ph_dependence, "about|around|between"),
ph_dependence = str_replace(ph_dependence, " and ", "-")) |>
#Creates a new variable with pH interval
mutate(pH_interval = str_extract(ph_dependence, "(\\d+\\.\\d*)-(\\d+\\.\\d*)")) |>
#Separates pH interval into columns
separate(pH_interval, into = pH_columns, sep = "-") |>
#Inserts optimum pH values from pH dependence if it is not and interval
mutate(pH_1 = coalesce(pH_1, ph_dependence),
pH_2 = coalesce(pH_2, ph_dependence)) |>
#Changes chr to dbl and calculates average pH optimum
mutate(pH_1 = as.numeric(pH_1),
pH_2 = as.numeric(pH_2),
ph_dependence = (pH_1+pH_2)/2) |>
select(-pH_1,-pH_2)
###
# Tidy "gene name" column (select the first one)
###
joined_data <- joined_data |>
# Only keep the first gene name
mutate(gene_name = str_extract(gene_names, "^\\S*")) |>
# Remove gene names column
select(-gene_names) |>
# Remove any duplicate entries
distinct()
###
# Tidy "organism" column
###
joined_data <- joined_data |>
# Check that there are only Homo sapiens (Human) and Mus musculus (Mouse)
filter(organism == "Homo sapiens (Human)" | organism == "Mus musculus (Mouse)") |>
# Shorten organism name
mutate(organism = case_when(
organism == "Homo sapiens (Human)" ~ "Human",
organism == "Mus musculus (Mouse)" ~ "Mouse"))Save cleaned data object
write_tsv(joined_data, file = "../data/02_dat_clean.tsv")Augmentation of data
Load librares
library(tidyverse)Load clean data
clean_data <- read_delim(file = "../data/02_dat_clean.tsv", delim = "\t")Rows: 2287129 Columns: 17
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (13): entry, cofactor, pathway, ec_number, GO_molecular_function, subcel...
dbl (4): ph_dependence, length, mass, active_site
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Augment data
Add column with peptide sequence around active site
# Function add column with sequence around active site
get_logo_seq <- function(x, y) {
y <- as.integer(y)
logo_seq_length <- 7
n_aa_per_side <- (logo_seq_length-1/2)
return(substr(x, y-3, y+3))
}
# Extracting peptides around active site and saving in new column
aug_data <- clean_data |>
mutate(logo_seq = get_logo_seq(sequence, active_site)) |>
# Filtereing out k-mers not covering 7 AA's
mutate(logo_seq = if_else(nchar(logo_seq) != 7, NA, logo_seq))Add count for the number of molecular functions for each protein
####
# Count the number of molecular functions for each entry
####
molecular_function_count <- clean_data |>
select(organism, entry, GO_molecular_function, length, mass) |>
distinct() |>
drop_na() |>
count(entry, name = "num_molecular_functions")
# Join num_molecular_functions
aug_data <- aug_data |>
full_join(molecular_function_count, by = "entry")Add number of cofactors that binds to each protein
####
# Count the number of cofactors
####
cofactor_count <- clean_data |>
select(organism, entry, cofactor, length, mass) |>
distinct() |>
drop_na() |>
count(entry, name = "num_cofactors")
# Join num_molecular_functions
aug_data <- aug_data |>
full_join(cofactor_count, by = "entry")Convert the first EC number into an enzyme class number
####
# Change "EC" column (we have decided only to work with the first digit but we are aware that the others have information aswell)
####
aug_data <- aug_data |>
# Extract the first digit and create new column
# For the few cases with more ec numbers we just look at the first
mutate(first_digit = str_extract(ec_number, "^\\d"),
# Translate digit to enzyme class using vector
enzyme_class = case_when(
first_digit == 1 ~ "oxidoreductases",
first_digit == 2 ~ "transferases",
first_digit == 3 ~ "hydrolases",
first_digit == 4 ~ "lyases",
first_digit == 5 ~ "isomerases",
first_digit == 6 ~ "ligases",
first_digit == 7 ~ "translocases",
)) |>
select(-first_digit, -ec_number)Save augment data object
write_tsv(aug_data, file = "../data/03_dat_aug.tsv")Protein size analysis
Load augmented data
library(tidyverse)
# Load data
data <- read_delim(file = "../data/03_dat_aug.tsv", delim = "\t")Plot number of molecular functions vs length
We wanted to see if proteins with more molecular functions were larger
#####
# Plot counted number of molecular functions vs length
#####
data |>
select(organism,
entry,
length,
num_molecular_functions) |>
distinct() |>
drop_na() |>
ggplot(aes(x = length,
y = num_molecular_functions,
fill = organism,
colour = organism)) +
geom_point(alpha = 0.5,
size = 0.75)+
labs(title = "Length vs. Molecular Function Count",
x = "Length",
y = "Number of Molecular Functions") +
theme_bw()ggsave("05_size_analysis_mol_func.png",
path = "../results")Saving 7 x 5 in image
Plot boxplot of cofactor vs length
We wanted to see if proteins that had more cofactors were generally larger. We chose to do this as a boxplot because the maximum number of cofactors was 5 and it was therefore a visually manageable plot that eased visualization substantially. it did however require that the number of cofactors was treated as a factor.
#####
# Plot counted number of cofactors functions vs length
#####
data |>
select(organism,
entry,
length,
num_cofactors)|>
distinct()|>
drop_na()|>
# num_cofactors is converted to factor to enable use of boxplot
ggplot(aes(y = length,
fill = factor(num_cofactors),
colour = factor(num_cofactors))) +
geom_boxplot(alpha = 0.5,
size = 0.75)+
labs(title = "Boxplot of length by number of binding cofactors",
y = "Length",
fill = "Number of cofactors",
color = "Number of cofactors") +
facet_wrap(~organism)+
# Extreme outliers are ignored to improve visualization
scale_y_continuous(limits = c(0, quantile(data$length, 0.98))) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).
ggsave("05_size_analysis_cofactor_size.png",
path = "../results")Saving 7 x 5 in image
Warning: Removed 17 rows containing non-finite values (`stat_boxplot()`).
Plot size vs mass in kDa
We were wondering if humans used more large side chains than mice. To investigate this we plotted length vs mass and performed linear regression. We made one linear model for each organism and plotted the 95% confidence interval as to enable easy visualization to check if they were significantly different from each other. Should this be the case it would indicate that the two species do in fact have differences in the size of their amino acid side chains.
#####
# Plot size vs mass (fix the structure to use pipes)
#####
data|>
select(organism,
entry,
length,
mass)|>
distinct()|>
drop_na()|>
# Create scatter plot with linear regression lines
ggplot(aes(x = mass,
y = length,
color = organism)) +
geom_point(alpha = 0.25,
size = 0.75) +
# Add linear regression line for human
geom_smooth(data = data |>
filter(organism == "Human"),
method = "lm",
se = TRUE,
formula = y ~ x,
linetype = "solid",
colour = "red")+
# Add linear regression line Mouse
geom_smooth(data = data |>
filter(organism == "Mouse"),
method = "lm",
se = TRUE,
formula = y ~ x,
linetype = "dashed",
colour = "blue")+
labs(title = "Scatter Plot of Length vs. Mass with Linear Regressions",
x = "Mass (kDa)",
y = "Length",
color = "Organism") +
# Extreme outliers are ignored to improove visualization
scale_y_continuous(limits = c(0, quantile(data$length, 0.99))) +
scale_x_continuous(labels = function(x) format(x, scientific = FALSE),
limits = c(0, quantile(data$mass, 0.99)))+
theme_bw()Warning: Removed 10799 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12163 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 111 rows containing missing values (`geom_point()`).
Warning: Removed 2 rows containing missing values (`geom_smooth()`).
Warning: Removed 3 rows containing missing values (`geom_smooth()`).
ggsave("05_size_analysis_lm_size_length.png",
path = "../results")Saving 7 x 5 in image
Warning: Removed 10799 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 12163 rows containing non-finite values (`stat_smooth()`).
Warning: Removed 111 rows containing missing values (`geom_point()`).
Warning: Removed 2 rows containing missing values (`geom_smooth()`).
Warning: Removed 3 rows containing missing values (`geom_smooth()`).
linear_models <- lm(length ~ mass + organism,
data = data|>
select(organism,
entry,
length,
mass)|>
distinct()|>
drop_na())pH Length investigation
The idea was to see if there was any connection between protein size and the optimal pH for the protein. The datapoints are coloured by organism to see if there are any clear differences between mice and humans.
####
# Assemble size-pH
####
data|>
select(organism,
entry,
length,
mass,
ph_dependence)|>
distinct()|>
drop_na()|>
# 368 distinct entries
# Plot a scatter plot of the
ggplot(aes(x= length,
y = ph_dependence,
fill = organism,
colour = organism)) +
geom_point(alpha = 0.75,
size = 0.75)+
labs(title = "Scatter Plot of length vs. pH",
x = "Sequence length",
y = "pH optimum")ggsave("05_size_analysis_pH_length.png",
path = "../results")Saving 7 x 5 in image
pH enzyme class comparison (needs to be moved)
We decided to test if enzyme classes had varying pH dependencies between humans and mice. We therefore made a boxplot of the pH dependencies on all enzyme classes and stratified on the organism to compare the two.
# Needs to be moved
####
# pH by enzyme class test
####
data|>
# Filter remoove unwanted columns and rows from data
select(organism,
entry,
length,
mass,
ph_dependence,
enzyme_class) |>
distinct() |>
drop_na() |>
# Plot the results
ggplot(aes(y = ph_dependence,
fill = enzyme_class,
colour = enzyme_class)) +
geom_boxplot(alpha = 0.75, size = 0.9) +
labs(color = "Enzyme class",
fill = "Enzyme class",
y = "pH optimum") +
facet_wrap(~organism)# 310 distinct entries
data|>
select(organism,
entry,
ph_dependence,
enzyme_class)|>
filter(organism == "Mouse",
enzyme_class == "ligases")|>
drop_na()|>
distinct(entry)# A tibble: 5 × 1
entry
<chr>
1 A4Q9F0
2 P28650
3 P46664
4 P48760
5 Q3THK7
# Only 5 ligases in mice which is not enough to determine anything Plotting Length vs enzyme class
The idea was to see if mice and human proteins varied in size when looking at specific enzyme classes.
####
# Size by enzyme class test
####
data|>
select(organism,
entry,
length,
mass,
enzyme_class) |>
distinct() |>
drop_na() |>
ggplot(aes(y = length,
fill = enzyme_class,
colour = enzyme_class)) +
geom_boxplot(alpha = 0.75, size = 0.9) +
labs(color = "Enzyme class",
fill = "Enzyme class",
y = "Length") +
facet_wrap(~organism) +
scale_y_continuous(limits = c(0, quantile(data$length, 0.98))) +
theme(axis.text.x = element_text(angle = 45, hjust = 1))Warning: Removed 92 rows containing non-finite values (`stat_boxplot()`).
# 310 distinct entries
ggsave("05_size_analysis_class_size.png",
path = "../results")Saving 7 x 5 in image
Warning: Removed 92 rows containing non-finite values (`stat_boxplot()`).
Plotting size by subcellular loaction
The idea was to see if protein sizes varied in subcellular locations and to check if there were any differences between mice and humans.
####
# Size by subcellular location
####
# Plot boxplots for protein sizes in three subcellular locations
data |>
filter(`subcellular_location_[cc]` == c("cell membrane", "cytoplasm","nucleus")) |>
ggplot(aes(y = length,
fill = `subcellular_location_[cc]`,
colour = `subcellular_location_[cc]`)) +
geom_boxplot(alpha = 0.75, size = 0.9) +
facet_wrap(~organism) +
scale_y_continuous(limits = c(0, quantile(data$length, 0.99))) +
labs(title = "Boxplot of protein lengths in three subcellular locations",
y = "Protein Length",
fill = "Subcellular location",
color = "Subcellular location") +
theme(legend.position = "bottom")Warning: There was 1 warning in `filter()`.
ℹ In argument: `==...`.
Caused by warning in `` `subcellular_location_[cc]` == c("cell membrane", "cytoplasm", "nucleus") ``:
! longer object length is not a multiple of shorter object length
Warning: Removed 2392 rows containing non-finite values (`stat_boxplot()`).
ggsave("05_size_analysis_location_size.png",
path = "../results")Saving 7 x 5 in image
Warning: Removed 2392 rows containing non-finite values (`stat_boxplot()`).
Decription of the data used in this project
Loading libraries
## Loading libraries
library(tidyverse)
library(knitr)
library(patchwork)Loading data
Introduction
We have downloaded the data of all the proteins of mouse and human in the Swiss-Prot database. The Swiss-Prot database is a freely accessible database of manually annotated and non-redundant proteins. The annotations are regularly reviewed to include potentially new findings regarding a specific protein. The same gene from the same species only get one entry in the database to avoid redundancy. All this information about the database makes the data trustworthy which is important for us. The goal is to examine the data to see if it holds anything interesting on a large-scale analysis where we use the general categories. We have decided to focus on mouse and human. We expect the two organisms to show high similarity as they are both mammals and the research of proteins in human and mouse often include the use of ortolog comparison. However, we dug into the data to see if we despite high similarity could extract some differences. Or are we just very much like the mouse? The data was fun to work with as it was untidy and required a lot of cleaning before we could start with the analysis. It was a high motivation for us to practice the data wrangling part as much as possible as we saw this part as the most challenging content in the course. The following part will show some general descriptive statistics we made to get an overview of the data, we ended up using for our project.
Descriptive statistics
The data includes information about the function of each of the 36990 proteins and the sequence and much more. For a start, we merge the two raw data files (function and sequence) into one. From this point, we started excluding variables which we did not want to continue with. We excluded the variables reviewed, function, temperature_dependence, protein_names, intramembrane, catalytic_activity, entry_name, and transmembrane. These columns were not informative as some of them were just long text descriptions, (e.g., function), reaction equations (e.g., catalytic_activity), redundant (entry_name), or useless. After some data wrangling (which can be seen in “02_clean.qmd”), we ended up with the following variables: The data includes information about the function, the sequence, and much more of each of the 36990 proteins. For a start, we merged the two raw data files (function and sequence) into one. From this point, we started excluding variables that we did not want to keep. We excluded the variables “reviewed”, “function”, “temperature_dependence”, “protein_names”, “intramembrane”, “catalytic_activity”, “entry_name”, and “transmembrane”. These columns were not informative as some of them were just long text descriptions, (e.g., “function”), reaction equations (e.g., “catalytic_activity”), redundant (“entry_name”), or useless. After some data wrangling (which can be seen in “02_clean.qmd”), we ended up with the following variables:
The different variables (columns) in the dataset:
# Extracting the column names
data |>
select(-entry, -gene_name) |>
colnames() [1] "cofactor" "ph_dependence"
[3] "pathway" "GO_molecular_function"
[5] "subcellular_location_[cc]" "organism"
[7] "length" "mass"
[9] "sequence" "active_site"
[11] "binding_site_pos" "binding_site_ligand"
[13] "protein_family" "family_type"
[15] "logo_seq" "num_molecular_functions"
[17] "num_cofactors" "enzyme_class"
Originally the dataset also contained entries from other mouse-species and from neanderthal. These have been removed. The proteins are described with 15 variables (including the species):
- pathway:
- e.g., purine metabolism, protein modification, protein ubiquitination
- cofactor:
- e.g., heme, mg(2+), zn(2+)
- ph_dependence
- subcellular_location_[cc]
- e.g., cytoplasm, nucleus, cell membrane
- GO_molecular_function
- e.g., metal ion binding, ATP binding, G protein-coupled receptor activity
- organism:
- Mouse or Human
- length
- The lenght of the amino acid sequence
- mass
- Mass of the protein in kilo dalton [kDa]
- sequence
- The amino acid sequence
- active_site
- Position of the active site
- binding_site_pos
- Position of the binding site
- binding_site_ligand
- e.g., Ca(2+), heme, ATP
- protein_family
- e.g., Protein kinase, G-protein coupled receptor 1
- family_type
- family / subfamily
- enzyme_class
- e.g., transferases, hydrolases
Protein amount for each organism
We do not see a great difference in the amount of proteins described for mouse and human. The following plot visualizes how similar the count is.
## This plot shows the total count of human proteins in the database vs. mouse proteins in the database
data |>
ggplot(mapping = aes(x=organism,
fill = organism)) +
geom_bar() +
theme_bw() +
theme(legend.position = "none",
axis.title.y = element_text(angle = 0,
vjust = 0.5,
size = 10),
axis.text.x = element_text(angle = 0,
size = 10)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
labs(title = "Total count of human and mouse",
y = "Count",
x = "Organism")## Saving the plot in results
ggsave("04_descibe_plot1_totalcount.png",
path = "../results")Saving 7 x 5 in image
The low difference in amount of proteins for mouse and human respectively is illustrating the low difference between mouse and human in general. The following plots shows the difference in length (first plot) and mass (second plot) for mouse and human. It is clear that the density plots for mouse and human respectively are very similar. This indicates that the proteins described in human has an ortholog in mouse which explains the low difference in length and mass.
Density plot for length of the proteins in the database stratified on mouse and human
## Density plot for the length of the proteins stratified on mouse and human
data |>
ggplot(aes(x = length,
fill = organism)) +
xlim(0, 2500) +
geom_density(alpha = 0.5) +
theme_bw() +
theme(axis.title.y = element_text(angle = 0,
vjust = 0.5,
size = 11),
axis.text.x = element_text(angle = 40,
hjust = 1,
size = 11)) +
labs(title = "Density plot of the length of the proteins in the database",
y = "Density",
x = "Length",
fill = "Organism")## Saving the plot in results
ggsave("04_descibe_plot2_density_length.png",
path = "../results")Density plot for the mass of the proteins in the database stratified on mouse and human
## Density plot for the mass of the proteins stratified on mouse and human
data |>
ggplot(aes(x = mass,
fill = organism)) +
geom_density(alpha = 0.5) +
scale_x_continuous(
labels = function(x) format(x, scientific = FALSE),
limits = c(0, 300000)
) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
theme_bw() +
theme(axis.title.y = element_text(angle = 0,
vjust = 0.5,
size = 11),
axis.text.x = element_text(angle = 40,
hjust = 1,
size = 11)) +
labs(title = "Density plot of the mass of the proteins in the database",
y = "Density",
x = "Mass",
fill = "Organism")## Saving the plot in results
ggsave("04_descibe_plot3_density_mass.png",
path = "../results")## Making a patchwork of the two density plots above
## Density plot for the length
length_density_plot <- data |>
ggplot(aes(x = length,
fill = organism)) +
xlim(0, 2500) +
geom_density(alpha = 0.5) +
theme_bw() +
theme(axis.title.y = element_text(angle = 0,
vjust = 0.5,
size = 11),
axis.text.x = element_text(angle = 40,
hjust = 1,
size = 11)) +
labs(y = "Density",
x = "Length") +
guides(fill = FALSE)Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
of ggplot2 3.3.4.
## Density plot for the mass
mass_density_plot <- data |>
ggplot(aes(x = mass,
fill = organism)) +
geom_density(alpha = 0.5) +
scale_x_continuous(
labels = function(x) format(x, scientific = FALSE),
limits = c(0, 300000)
) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
theme_bw() +
theme(axis.text.x = element_text(angle = 40,
hjust = 1,
size = 11),
axis.title.y = element_blank()) +
labs(x = "Mass",
fill = "Organism")
## Combine the plots
combined_length_mass <- length_density_plot +
mass_density_plot
## Print the combined plots
print(combined_length_mass)Warning: Removed 55729 rows containing non-finite values (`stat_density()`).
Warning: Removed 32649 rows containing non-finite values (`stat_density()`).
## Saving the plot in results
ggsave("04_descibe_plot2and3_density_length_and_mass.png",
path = "../results")Saving 7 x 5 in image
Warning: Removed 55729 rows containing non-finite values (`stat_density()`).
Removed 32649 rows containing non-finite values (`stat_density()`).
Unique entries of each variable
The variables that we kept have a different count of unique entries, e.g., there are 7 different entries for “enzyme_class” and 538 different “GO_molecular_function”. The following table counts the number of unique entries of each variable.
## Table of the unique entries of each variable
summary_table <- data |>
select(
cofactor,
`subcellular_location_[cc]`,
GO_molecular_function,
protein_family,
enzyme_class,
pathway) |>
drop_na() |>
pivot_longer(
cols = c(
cofactor,
`subcellular_location_[cc]`,
GO_molecular_function,
protein_family,
enzyme_class,
pathway
),
names_to = "Column",
values_to = "Value"
) |>
group_by(Column) |>
summarise(Unique_Entries = n_distinct(Value))
# Using kable() to make the table
kable(summary_table)| Column | Unique_Entries |
|---|---|
| GO_molecular_function | 576 |
| cofactor | 31 |
| enzyme_class | 7 |
| pathway | 246 |
| protein_family | 179 |
| subcellular_location_[cc] | 91 |
To further improve the overview of the unique entries, we have made a bar plot showing the same as the table:
Illustrated with a bar plot as well
## Bar plot of the unique entries for each variable
data |>
select(cofactor,
`subcellular_location_[cc]`,
GO_molecular_function,
protein_family,
enzyme_class,
pathway) |>
drop_na()|>
pivot_longer(cols = c(cofactor,
`subcellular_location_[cc]`,
GO_molecular_function, protein_family,
enzyme_class, pathway),
names_to = "Column",
values_to = "Value") |>
group_by(Column) |>
summarise(Unique_Entries = n_distinct(Value)) |>
ggplot(aes(x=Column,
y=Unique_Entries,
fill = Column)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(
axis.title.y = element_text(angle = 0,
vjust = 0.5,
size = 11),
axis.text.x = element_text(angle = 40,
hjust = 1,
size = 11),
) +
guides(fill = FALSE) +
labs(title = "Amounts of unique entries for the different variables",
x = "Variable",
y = "Unique entries")## Saving the plot in results
ggsave("04_descibe_plot4_unique_entries.png",
path = "../results")Saving 7 x 5 in image
The unique entries of the variable “enzyme_class”
Now we dig into a specific variable to give an example of how the variables look like. The “enzyme_class” contains seven unique entries. The amount of each entry is shown in the plot below:
data |>
select(enzyme_class) |>
drop_na() |>
group_by(enzyme_class) |>
summarise(Amount = n()) |>
ggplot(aes(x=enzyme_class,
y=Amount,
fill=enzyme_class)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(
axis.title.y = element_text(angle = 0,
vjust = 0.5,
size = 11),
axis.text.x = element_text(angle = 40,
hjust = 1,
size = 11),
) +
guides(fill = FALSE) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
labs(title = "Amount of different enzyme classes",
y = "Amount",
x = "Enzyme class")In this project, we have primarilt tried to examine a difference between mouse and human. The following plot is an example of how we have dug into the data with the intention to visualized the difference between mouse and human in different categories.
## Amount of different enzyme classes in Human
enzymes_Human <- data |>
select(enzyme_class, organism) |>
drop_na() |>
filter(organism == "Human") |>
group_by(enzyme_class) |>
summarise(Amount = n()) |>
ggplot(aes(x=enzyme_class,
y=Amount,
fill=enzyme_class)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.title.x = element_blank(),
axis.text.x = element_text(angle = 40,
hjust = 1,
size = 10),
plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
labs(title = "Human") +
guides(fill = FALSE)
## Amount of different enzyme classes in Mouse
enzyme_Mouse <- data |>
select(enzyme_class,
organism) |>
drop_na() |>
filter(organism == "Mouse") |>
group_by(enzyme_class) |>
summarise(Amount = n()) |>
ggplot(aes(x=enzyme_class,
y=Amount,
fill=enzyme_class)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.title.x = element_blank(),
axis.title.y = element_blank(),
axis.text.x = element_text(angle = 40,
hjust = 1,
size = 10),
plot.title = element_text(hjust = 0.5)) +
scale_y_continuous(labels = function(x) format(x, scientific = FALSE)) +
labs(title = "Mouse") +
guides(fill = FALSE)
## Combining the two plots with patchwork
p <- enzymes_Human +
enzyme_Mouse +
plot_annotation("Amount of different enzyme classes in human and mouse",
theme=theme(plot.title=element_text(hjust=0.5)))
p## Saving the plot in results
ggsave("04_descibe_plot5_different_enzyme_classes.png",
path = "../results")Saving 7 x 5 in image
Sequence logo plots for peptides around active site
In this analysis we will create and compare logo plots for human- and mouse-entries with available “active site” information in uniprot.
We do so first building sequence logo plots for different protein families. Later we zoom out and look at enzyme classes instead, as defined by the first digit of the EC number (see data augmentation steps).
Load librares
library(ggseqlogo)
library(patchwork)
library(tidyverse)
library(knitr)Load data
aug_data <- read_delim(file = "../data/03_dat_aug.tsv",
delim = "\t")Identify protein families to look at
The purpose of this step is to identify the protein families where most information is available.
The table in the output message will show the 10 most frequently observed protein families as an average across human and mice.
# Filter out if no active site info
logoseq_data <- aug_data |>
drop_na(logo_seq) |>
filter(family_type == "family") |>
select(entry,
organism,
protein_family,
family_type,
logo_seq)
# See what protein families with active site data occur most often in mouse
top_prot_fam_mouse <- logoseq_data |>
drop_na(protein_family) |>
filter(str_detect(organism,
"Mouse")) |>
select(entry,
protein_family,
logo_seq) |>
distinct() |>
group_by(protein_family) |>
summarize(n_mouse = n()) |>
arrange(desc(n_mouse))
# See what protein families with active site data occur most often in human
top_prot_fam_human <- logoseq_data |>
drop_na(protein_family) |>
filter(organism == "Human") |>
select(entry, protein_family, logo_seq) |>
distinct() |>
group_by(protein_family) |>
summarize(n_human = n()) |>
arrange(desc(n_human))
# Combine mouse and human protein family counts and available active site information
top_prot_fam <- top_prot_fam_mouse |>
full_join(top_prot_fam_human,
by = "protein_family") |>
mutate(n_average = (n_mouse + n_human) /2) |>
arrange(desc(n_average))
top_prot_fam |>
slice(1:10)|> knitr::kable()| protein_family | n_mouse | n_human | n_average |
|---|---|---|---|
| Peptidase S1 | 363 | 326 | 344.5 |
| Peptidase C19 | 100 | 133 | 116.5 |
| Protein-tyrosine phosphatase | 92 | 97 | 94.5 |
| Tyr protein kinase | 88 | 86 | 87.0 |
| CAMK Ser/Thr protein kinase | 79 | 72 | 75.5 |
| AGC Ser/Thr protein kinase | 60 | 64 | 62.0 |
| CMGC Ser/Thr protein kinase | 61 | 62 | 61.5 |
| Ser/Thr protein kinase | 62 | 60 | 61.0 |
| Lipase | 54 | 63 | 58.5 |
| STE Ser/Thr protein kinase | 54 | 54 | 54.0 |
# Extract names of protein families in arranged order, to use for further analysis
top_prot_fam_name_vec <- top_prot_fam |>
pull(protein_family)Logo plots for protein families
Here we use ggseqlogo() to visualize peptide sequences for each of the top 5 most frequently observed protein families as logo plots. The logo plots will be stitched together using both the inbuilt facet function of gggseqlogo() and patchwork.
The logo plot for mouse and human protein families are very similar, showing that mouse and human homologs very similar around the active sites. In addition, this analysis show how uniprot data can be utilized to gain knowledge about active sites for specific protein families.
### Protein family logo plots ###
# In this code chunk we work with the top 5 most frequently observed protein families
# Extract mouse data and filter out if no active site info
logoseq_data_mouse_prot <- aug_data |>
drop_na(logo_seq, protein_family) |>
filter(protein_family %in% top_prot_fam_name_vec[1:5]) |>
filter(str_detect(organism, "Mouse")) |>
select(entry,
organism,
protein_family,
logo_seq) |>
distinct()
# Extract human data and filter out if no active site info
logoseq_data_human_prot <- aug_data |>
drop_na(logo_seq, protein_family) |>
filter(protein_family %in% top_prot_fam_name_vec[1:5]) |>
filter(str_detect(organism, "Human")) |>
select(entry,
organism,
protein_family,
logo_seq) |>
distinct()
# Change proten family names to get line breaks (check if names match before changing them)
# Converting data to named list for use with facet function in ggseqlogo
# For mouse
logo_list_mouse_prot <- logoseq_data_mouse_prot |>
select(logo_seq, protein_family) |>
group_split(protein_family) |>
map(~ pull(.x, logo_seq)) |>
map(unique) # Filter out identical sequences
names(logo_list_mouse_prot) <- c("CAMK Ser/Thr\nprotein kinase",
"Peptidase C19 ",
"Peptidase S1 ",
"Protein-tyrosine\nphosphatase ",
"Tyr protein\nkinase ")
# For human
logo_list_human_prot <- logoseq_data_human_prot |>
select(logo_seq, protein_family) |>
group_split(protein_family) |>
map(~ pull(.x, logo_seq)) |>
map(unique) # Filter out identical sequences
names(logoseq_data_human_prot) <- c("CAMK Ser/Thr\nprotein kinase",
"Peptidase C19 ",
"Peptidase S1 ",
"Protein-tyrosine\nphosphatase ",
"Tyr protein\nkinase ")Warning: The `value` argument of `names<-` must have the same length as `x` as of tibble
3.0.0.
# Making logo plot for mouse protein families
logo_plot_mouse_prot <- ggseqlogo(logo_list_mouse_prot,
ncol=7) +
ggtitle("Mouse protein families") +
theme(plot.title = element_text(hjust = 0.5,
size = 12),
strip.text = element_text(size = 8)) +
scale_x_continuous(breaks = 1:7, # Existing tick positions
labels = -3:3 # Existing tick labels
)
# Making logo plot for human protein families
logo_plot_human_prot <- ggseqlogo(logo_list_human_prot,
ncol=7) +
ggtitle("Human protein families") +
theme(plot.title = element_text(hjust = 0.5,
size = 12),
strip.text = element_text(size = 8)) +
scale_x_continuous(breaks = 1:7, # Existing tick positions
labels = -3:3 # Existing tick labels
)
# Combining mouse and human logo plot for protein families
logo_comb_prot <- logo_plot_human_prot / logo_plot_mouse_prot
logo_comb_prot# Save combined logo plot as png
ggsave(filename = "../results/06_logoplot_prot_fam.png",
logo_comb_prot)Logo plots for enzyme class
Here we use ggseqlogo() to visualize peptide sequences for different types of enzyme classes according to the first digit in their EC number. We filtered out “translocases”, because very few entries very observed for this enzyme class resulting in an artifically high information content. The logo plots will be stitched together using both the inbuilt facet function of gggseqlogo() and patchwork.
Just as for the protein family logo plots, the logo plots for mouse and human enzyme classes are very similar. Enzyme classes are less specific than protein families, and thus we naturally observe lower information content in general. However we thought that “zooming out”, might reveal some differences between human and mice enzymes in general, but this was not the case.
### Enzyme class logo plots ###
# Extract mouse data and filter out if no active site info
logoseq_data_mouse_enz <- aug_data |>
drop_na(logo_seq, enzyme_class) |>
filter(str_detect(organism,
"Mouse")) |>
# Removing translocases because limited data available
filter(!enzyme_class == "translocases") |>
select(entry,
organism,
enzyme_class,
logo_seq) |>
distinct()
# Extract human data and filter out if no active site info
logoseq_data_human_enz <- aug_data |>
drop_na(logo_seq, enzyme_class) |>
filter(str_detect(organism,
"Human")) |>
# Removing translocases because limited data available
filter(!enzyme_class == "translocases") |>
select(entry,
organism,
enzyme_class,
logo_seq) |>
distinct()
# Converting data to named list for facet function in ggseqlogo: mouse data
logo_list_mouse_enz <- logoseq_data_mouse_enz |>
select(logo_seq, enzyme_class) |>
group_split(enzyme_class) |>
map(~ pull(.x, logo_seq)) |>
map(unique) # Filter out identical sequences
names(logo_list_mouse_enz) <- c("Hydrolases",
"Isomerases",
"Ligases",
"Lyases",
"Oxidoreductases",
"Transferases")
# Converting data to named list for facet function in ggseqlogo: human data
logo_list_human_enz <- logoseq_data_human_enz |>
select(logo_seq, enzyme_class) |>
group_split(enzyme_class) |>
map(~ pull(.x, logo_seq)) |>
map(unique) # Filter out identical sequences
names(logo_list_human_enz) <- c("Hydrolases",
"Isomerases",
"Ligases",
"Lyases",
"Oxidoreductases",
"Transferases")
# Making logo plots for mouse enzyme classses
logo_plot_mouse_enz <- ggseqlogo(logo_list_mouse_enz,
ncol=6) +
ggtitle("Mouse enzyme classes") +
theme(plot.title = element_text(hjust = 0.5,
size = 12),
strip.text = element_text(size = 8)) +
scale_x_continuous(breaks = 1:7, # Existing tick positions
labels = -3:3 # Existing tick labels
)
# Making logo plots for human enzyme classses
logo_plot_human_enz <- ggseqlogo(logo_list_human_enz,
ncol=6) +
ggtitle("Human enzyme classes") +
theme(plot.title = element_text(hjust = 0.5,
size = 12),
strip.text = element_text(size = 8)) +
scale_x_continuous(breaks = 1:7, # Existing tick positions
labels = -3:3 # Existing tick labels
)
# Combining mouse and human logo plots for enzyme classes
logo_comb_enz <- logo_plot_human_enz / logo_plot_mouse_enz
logo_comb_enz# Save combined logo plot
ggsave(filename = "../results/06_logoplot_enz.png",
logo_comb_enz)Location Analysis
Load data
library(tidyverse)
data_clean <- read_delim(file = "../data/03_dat_aug.tsv", delim = "\t")Analysis
We will look at the distribution of enzyme classes, molecular functions and pathways in the three most populated subcellular locations to see if they are differently distributed. These three locations are the cell membrane, the cytoplasm and the nucleus.
Enzyme class and subcellular location
There are 7 enzyme classes representing 7 types of enzyme catalyzed reactions. The distribution of enzyme classes is very similar between human and mice but there are some subtle differences between the three subcellular locations. There are more translocases in the cell membrane than in the other subcellular locations. The high relative number of translocases in the cell membrane makes great biological sense as they catalyze the transport of molecules and ions across membranes among other things.
data_clean |>
drop_na(enzyme_class, `subcellular_location_[cc]`) |>
select(entry, enzyme_class, organism, `subcellular_location_[cc]`) |>
distinct() |>
filter(`subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane")) |>
ggplot(aes(x = `subcellular_location_[cc]`,
color = enzyme_class,
fill = enzyme_class)) +
geom_bar(position = "fill") +
labs(title = "Enzyme classes in subcellular locations",
subtitle = "Distribution of the 7 enzyme classes in three subcellular locations",
x = "Subcellular location",
y = "Relative count",
color = "Enzyme class",
fill = "Enzyme class") +
theme_bw() +
theme(legend.position = "bottom",
legend.text = element_text(size = 8),
legend.title = element_text(size = 10),
plot.subtitle = element_text(size = 10)) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE),
color = guide_legend(nrow = 2, byrow = TRUE)) +
facet_wrap(~organism)Warning: There was 1 warning in `filter()`.
ℹ In argument: ``subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell
membrane")`.
Caused by warning in `` `subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane") ``:
! longer object length is not a multiple of shorter object length
ggsave("07_location_analysis_EC.png",
path = "../results")Saving 7 x 5 in image
Molecular function in subcellular location
The molecular functions are very equally distributed in human and mice but there are differences between the three subcellular locations. There are many more DNA binding proteins in the nucleus where DNA is stored. There are more ATP binding proteins in the cell membrane and cytoplasm compared to the nucleus. There are also more identical protein binding proteins in the cell membrane i.e. it appears that homogeneous protein-protein interactions are more common in the cell membrane than in the two other subcellular locations.
# Vector with top 5 molecular functions
top5_GO_molecular_function <- data_clean |>
drop_na(GO_molecular_function) |>
select(entry, GO_molecular_function) |>
distinct() |>
group_by(GO_molecular_function) |>
summarize(count = n()) |>
arrange(desc(count)) |>
pull(GO_molecular_function) |>
head(5)# Plot of molecular function vs subcellular location
data_clean |>
drop_na(GO_molecular_function, `subcellular_location_[cc]`) |>
select(entry, GO_molecular_function, organism, `subcellular_location_[cc]`) |>
distinct() |>
filter(`subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane"),
GO_molecular_function == top5_GO_molecular_function) |>
ggplot(aes(x = `subcellular_location_[cc]`,
color = GO_molecular_function,
fill = GO_molecular_function)) +
geom_bar(position = "fill") +
labs(title = "Molecular functions in subcellular locations",
subtitle = "Distribution of the top 5 molecular functions in three subcellular locations",
x = "Subcellular location",
y = "Relative count",
color = "Molecular function",
fill = "Molecular function") +
theme_bw() +
theme(legend.position = "bottom",
legend.text = element_text(size = 7),
legend.title = element_text(size = 10),
plot.subtitle = element_text(size = 10)) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE),
color = guide_legend(nrow = 2, byrow = TRUE)) +
facet_wrap(~organism)Warning: There was 1 warning in `filter()`.
ℹ In argument: `GO_molecular_function == top5_GO_molecular_function`.
Caused by warning in `GO_molecular_function == top5_GO_molecular_function`:
! longer object length is not a multiple of shorter object length
ggsave("07_location_analysis_mol_func.png",
path = "../results")Saving 7 x 5 in image
Pathways in subcellular location
Compared to the molecular functions and enzyme classes, there is a greater difference between the pathways in human and mice. However, there is also less data about the pathways in the three subcellular locations which might be the reason behind this apparent difference between organisms. In human, proteins involved in amino acid biosynthesis and degradation are found in the cytoplasm but none of these are found in mice. Mice obviously also have amino acid metabolism which ought to take place in the cytoplasm like it does in human. Greater annotation of the proteins is needed to be able to compare the pathways in human and mice.
# Vector with top 10 pathways
top10_pathway <- data_clean |>
drop_na(pathway) |>
select(entry, pathway) |>
distinct() |>
group_by(pathway) |>
summarize(count = n()) |>
arrange(desc(count)) |>
pull(pathway) |>
head(10)# Plot of pathway vs subcellular location
data_clean |>
drop_na(pathway, `subcellular_location_[cc]`) |>
select(entry, pathway, organism, `subcellular_location_[cc]`) |>
distinct() |>
filter(`subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane"),
pathway == top10_pathway) |>
ggplot(aes(x = `subcellular_location_[cc]`,
color = pathway,
fill = pathway)) +
geom_bar() +
labs(title = "Pathways in subcellular locations",
subtitle = "Distribution of the top 10 pathways in three subcellular locations",
x = "Subcellular location",
y = "Count",
color = "Pathway",
fill = "Pathway") +
theme_bw() +
theme(legend.position = "bottom",
legend.text = element_text(size = 5),
legend.title = element_text(size = 9),
plot.subtitle = element_text(size = 10)) +
guides(fill = guide_legend(nrow = 2, byrow = TRUE),
color = guide_legend(nrow = 2, byrow = TRUE)) +
facet_wrap(~organism)Warning: There were 2 warnings in `filter()`.
The first warning was:
ℹ In argument: ``subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell
membrane")`.
Caused by warning in `` `subcellular_location_[cc]` == c("nucleus", "cytoplasm", "cell membrane") ``:
! longer object length is not a multiple of shorter object length
ℹ Run `dplyr::last_dplyr_warnings()` to see the 1 remaining warning.
ggsave("07_location_analysis_pathways.png",
path = "../results")Saving 7 x 5 in image